% HYPOTHETICAL EXPERIMENT/SIMULATION
% N subjects are followed up for a total time t and measured every dt years
% p1 the risk of developing disease when exposed
% p0 the risk of developing disease when not exposed
% e = probability of being exposed
% we assume that there is no loss to follow-up

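%%%% Paper simulation scenario III: In scenario III we assumed that disease D increased the probability of being exposed by ten times, but that exposure E did not increase the risk of disease (i.e. reverse causation). 
% NOTE(review): the parameters set below (de = 5, RR = 5) do not match this
% description (ten-fold increase, no effect of E on D) - confirm which one is intended.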

% Simulate a cohort of N subjects over discrete measurement intervals.
% In every interval an exponential waiting time is drawn for exposure and for
% disease; an event whose waiting time is shorter than dt switches the
% corresponding status on from the next measurement until the end of follow-up.
clear  % reset workspace variables ('clear all' would additionally flush functions and breakpoints)

% ---- Simulation parameters ----
N = 1000;        % number of simulated subjects
pe = 0.1;        % baseline exposure rate (equal to pe in the article)
RR = 5;          % the relative risk associated with the exposure
p0 = 0.01;       % disease rate while not exposed
p1 = RR*p0;      % disease rate once exposed
followup = 10;   % length of follow-up
dt = 1;          % width of one measurement interval
de = 5;          % getting the disease multiplies the exposure rate by de
pde = de*pe;     % exposure rate once diseased
% NOTE(review): the scenario description at the top of the file states a
% ten-fold increase (de = 10) and no effect of exposure on disease (RR = 1);
% the values above differ from that - confirm which one is intended.

% Number of intervals and number of measurement columns (baseline + one per
% interval). Deriving the loop bound from followup/dt keeps the column index
% an integer for any dt; with dt = 1 this gives time = 11 exactly as before.
nsteps = round(followup/dt);
time = nsteps + 1;

% The subject array has 5 layers (third dimension):
%   1 = current disease rate        2 = exposure status (0/1)
%   3 = disease status (0/1)        4 = not written in this section
%   5 = current exposure rate
subject = zeros(N,time,5);
subject(:,:,1) = p0;
subject(:,:,5) = pe;

% Initialize the random number generator to make the results repeatable.
rng(0,'twister');

for s = 1:N  % s is the subject number
    for t = 1:nsteps  % t is the measurement (column) index
        lambdae = subject(s,t,5); % current exposure rate
        lambdad = subject(s,t,1); % current disease rate

        % Waiting times of a Poisson process are exponentially distributed.
        % The draw order (exposure first, then disease) is kept so that the
        % random number stream is consumed exactly as before.
        wte = -log(rand(1,1))/lambdae;
        wtd = -log(rand(1,1))/lambdad;

        if wte < dt
            % Exposed within this interval: exposed until the end of
            % follow-up, with the higher disease rate from then on.
            subject(s,t+1:time,2) = 1;
            subject(s,t+1:time,1) = p1;
        end

        if wtd < dt
            % Diseased within this interval: diseased until the end of
            % follow-up, with the higher exposure rate from then on.
            subject(s,t+1:time,3) = 1;
            subject(s,t+1:time,5) = pde;
        end
    end
end


%%%%%% Put infodynamics.jar on the dynamic Java class path
jidt_jar = 'C:\Users\Ahmad Aziz\Desktop\Papers\Causal_inference\infodynamics-dist-1.2.1/infodynamics.jar';
javaaddpath(jidt_jar);

%% Cut a time window out of the cohort and flatten it into two long rows

tb = 1;           % first column of the time window
tw = time - 1;    % the window spans columns tb .. tb+tw
window = tb:(tb + tw);

sourceArray0 = subject(:, window, 2);  % layer 2 (exposure) is the source
destArray0   = subject(:, window, 3);  % layer 3 (disease) is the destination

%% Flatten to 1-by-M rows: each subject's window, one subject after another
% NOTE(review): the resulting rows run straight across subject boundaries, so
% any history taken over consecutive samples can mix two subjects - confirm
% this pooling is intended.
sourceArray1 = sourceArray0.';
destArray1   = destArray0.';
sourceArray  = reshape(sourceArray1, 1, []);
destArray    = reshape(destArray1, 1, []);
% Transfer entropy from sourceArray to destArray with the discrete calculator
% from infodynamics.jar, followed by a surrogate-based significance test.
k=10;  % second constructor argument (presumably the destination history length - confirm)

% First constructor argument is 2 (presumably the number of discrete states
% of the binary series - confirm against the library documentation).
teCalc=javaObject('infodynamics.measures.discrete.TransferEntropyCalculatorDiscrete', 2, k);
teCalc.initialise();
% Since we have simple arrays of ints, we can directly pass these in:
teCalc.addObservations(sourceArray, destArray);
% Calculation of the TE:
result = teCalc.computeAverageLocalOfObservations();
fprintf('The transfer entropy is %.4f bits.\n', result);

% Significance test; the argument 1000 is presumably the number of surrogates.
cs = teCalc.computeSignificance(1000);
% The results are deliberately left without semicolons so they are displayed.
% The mean is stored as te_null_mean (not 'mean') so that the builtin mean()
% stays callable after this script has run.
te_null_mean = getMeanOfDistribution(cs)
sd = getStdOfDistribution(cs)
tscore = getTSscore(cs) %Assuming the distribution is Gaussian, return a t-score for our observed measurement
pvalue = cs.pValue
% % t2=(result2-te_null_mean)/sd
dist=cs.distribution;

% p_empirical is the probability of this value or higher in the surrogate
% distribution. It is stored as p_empirical (not 'pdist') to avoid shadowing
% the Statistics Toolbox function pdist. invprctile is not a MATLAB builtin:
% use it when it is on the path (same computation as before), otherwise fall
% back to the plain fraction of surrogate values >= the observed value.
if exist('invprctile', 'file')
    p_empirical = (100-invprctile(dist',result,2))/100
else
    p_empirical = sum(dist(:) >= result)/numel(dist)
end

% %%%% Bootstrap confidence intervals %%%%%%%%%%%%%%%%%% 
% global counter btci_dist
% counter = 0;
% h = @aziz_bootstrapci_bin_transferentropy;
% bci = bootci(100,{h,sourceArray', destArray'}, 'alpha', 0.05, 'type', 'bca')

% Pearson correlation (and its p-values) between the two flattened series;
% the two vectors are passed as the columns of a single matrix.
[correlation, p] = corrcoef([sourceArray(:), destArray(:)])
%rho=corr(sourceArray',destArray')
%Pearson_corr(s)=pr(1,2);

%%% Export a cross-section of the cohort at the end of the follow-up period,
%%% for a cross-tabulation and chi-square statistic computed elsewhere.
ct = time;                    % column of the cross-section
exposure = subject(:, ct, 2); % exposure status in that column
disease  = subject(:, ct, 3); % disease status in that column

%save('aziz_paper_simulation_3.mat', 'exposure', 'disease');
out_file   = 'aziz_paper_simulation_3_v2.mat';
saved_vars = {'subject', 'exposure', 'disease'};
save(out_file, saved_vars{:});
% aziz_sim1=[sourceArray',destArray'];
% save('aziz_sim1.mat','aziz_sim1');

% Line plot of the histogram of dist. The former leading 'hist(dist);' call
% only drew a bar chart that the plot() below replaced immediately, so it is
% dropped; the final figure is unchanged.
[counts, bins] = hist(dist);  % bin counts and bin centres (hist default binning)
plot(bins, counts); %# get a line plot of the histogram

% tev is a vector containing the individual transfer entropies

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Nested GLM comparison with all previous 'Ds' (or 'Es') and a constant as
% predictors and disease (or 'exposure') status at the end of follow-up as the outcome
% NOTE(review): this analysis is described as logistic regression, but the
% models are fitted with 'Distribution','poisson' - confirm the intended family.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%STEP 1: Estimate how well the regression with past values of E
%and D can predict the future of D at the final step.

% Predictor columns: the first column is constant (i.e. only zeros) and is
% thus omitted; the last column is left out of the predictors.
past = 2:time-1;

% Model with all previous Ds as predictors
mdl =  fitglm(subject(:,past,3),disease,'linear','Distribution','poisson')
llh1=mdl.LogLikelihood
% aic = mdl.ModelCriterion.loglikelihood

% Model with all previous Ds and Es as predictors
ED=horzcat(subject(:,past,3),subject(:,past,2));
mdl2 =  fitglm(ED,disease,'linear','Distribution','poisson')
llh2=mdl2.LogLikelihood

% Log likelihood ratio test comparing the two nested models. The degrees of
% freedom equal the number of added E predictors (9 for time = 11). The upper
% tail is requested directly because 1-chi2cdf(...) rounds very small
% p-values to exactly 0.
chi2=2*(llh2-llh1);
df_1_2 = numel(past);
p_mdl_1_2=chi2cdf(chi2,df_1_2,'upper')

%STEP 2: Estimate how well the regression with past values of E
%and D can predict the future of E at the final step.
% NOTE(review): described as logistic regression, but the models are fitted
% with 'Distribution','poisson' - confirm the intended family.

% Predictor columns: the first column is constant (i.e. only zeros) and is
% thus omitted; the last column is left out of the predictors.
past = 2:time-1;

% Model with all previous Es as predictors
mdl3 =  fitglm(subject(:,past,2),exposure,'linear','Distribution','poisson')
llh3=mdl3.LogLikelihood
% aic = mdl.ModelCriterion.loglikelihood

% Model with all previous Ds and Es as predictors
ED=horzcat(subject(:,past,3),subject(:,past,2));
mdl4 =  fitglm(ED,exposure,'linear','Distribution','poisson')
llh4=mdl4.LogLikelihood

% Log likelihood ratio test comparing the two nested models. The degrees of
% freedom equal the number of added D predictors (9 for time = 11). The upper
% tail is requested directly because 1-chi2cdf(...) rounds very small
% p-values to exactly 0.
chi2=2*(llh4-llh3);
df_3_4 = numel(past);
p_mdl_3_4=chi2cdf(chi2,df_3_4,'upper')


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Nested GLM comparison with the penultimate 'D' (or 'E') and a constant as
% predictors and disease (or 'exposure') status at the end of follow-up as the outcome
% NOTE(review): this analysis is described as logistic regression, but the
% models are fitted with 'Distribution','poisson' - confirm the intended family.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%STEP 1: Estimate how well the regression with penultimate values of E
%and D can predict the future of D at the final step.

% Model with penultimate D (column time-1 of layer 3) as the only predictor
mdl5 =  fitglm(subject(:,time-1,3),disease,'linear','Distribution','poisson')
llh5=mdl5.LogLikelihood
% aic = mdl.ModelCriterion.loglikelihood

% Model with penultimate D and E as predictors
ED=horzcat(subject(:,time-1,3),subject(:,time-1,2));
mdl6 =  fitglm(ED,disease,'linear','Distribution','poisson')
llh6=mdl6.LogLikelihood

% Log likelihood ratio test comparing the two nested models (1 degree of
% freedom: the single added E predictor). The upper tail is requested
% directly because 1-chi2cdf(...) rounds very small p-values to exactly 0.
chi2=2*(llh6-llh5);
p_mdl_5_6=chi2cdf(chi2,1,'upper')

%STEP 2: Estimate how well the regression with penultimate values of E
%and D can predict the future of E at the final step.
% NOTE(review): described as logistic regression, but the models are fitted
% with 'Distribution','poisson' - confirm the intended family.

% Model with penultimate E (column time-1 of layer 2) as the only predictor
mdl7 =  fitglm(subject(:,time-1,2),exposure,'linear','Distribution','poisson')
llh7=mdl7.LogLikelihood
% aic = mdl.ModelCriterion.loglikelihood

% Model with penultimate D and E as predictors
ED=horzcat(subject(:,time-1,3),subject(:,time-1,2));
mdl8 =  fitglm(ED,exposure,'linear','Distribution','poisson')
llh8=mdl8.LogLikelihood

% Log likelihood ratio test comparing the two nested models (1 degree of
% freedom: the single added D predictor). The upper tail is requested
% directly because 1-chi2cdf(...) rounds very small p-values to exactly 0.
chi2=2*(llh8-llh7);
p_mdl_7_8=chi2cdf(chi2,1,'upper')